The dataset for this competition (both train and test) was generated from a deep learning model trained on the Amazon.com - Employee Access dataset. The goal is to predict whether or not access should be granted according to an employee's role information and a resource code.
There are 9 independent variables (including id):
Target variable:
Metrics:
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import combinations
import pickle
import warnings
warnings.filterwarnings("ignore")
# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode , plot
from plotly.graph_objs import *
init_notebook_mode()
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import plot
import missingno as msno
# Modeling
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier , ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , roc_auc_score
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn import base
from sklearn.model_selection import KFold , StratifiedKFold , train_test_split
from category_encoders import OrdinalEncoder , LeaveOneOutEncoder
import optuna
from optuna import Trial
# Load the competition data. ACTION is the target column in the training file.
data = pd.read_csv('./train.csv')
data_test = pd.read_csv('./test.csv')
Y = data['ACTION']
X = data.drop('ACTION', axis = 1)
# The test file carries an 'id' column (for the submission) instead of ACTION.
X_test = data_test.drop('id' , axis = 1)
def missing_values_table(data):
    """Summarize missing values per column of ``data``.

    Prints a one-line summary, then returns a DataFrame with one row per
    column that has at least one missing value, holding the absolute count
    ('Missing Values') and the share of rows ('% of Total Values'), sorted
    by share descending.
    """
    null_counts = data.isnull().sum()
    null_share = null_counts / data.shape[0] * 100
    summary = pd.concat([null_counts , null_share] , axis = 1)
    summary = summary.rename(columns = {0 : 'Missing Values' , 1 : '% of Total Values'})
    summary.sort_values('% of Total Values' , ascending = False , inplace = True)
    n_with_missing = (summary['% of Total Values'] != 0).sum()
    print('There are a total of {} columns, and {} features have missing values.'.format(len(summary) , n_with_missing))
    # Only report the columns that actually have missing values.
    return summary.loc[summary['% of Total Values'] != 0]
# Report missing values for both splits (the printed output shows none).
missing_values_table(X)
missing_values_table(X_test)
# missingno matrices give a visual confirmation of the missingness pattern.
msno.matrix(X).set_title('Distribution of Missing Values(Training Data)', fontsize = 20)
msno.matrix(X_test).set_title('Distribution of Missing Values(Testing Data)', fontsize = 20)
# The features are nominal ID codes; inspect them as strings to emphasize
# that they are categories rather than magnitudes.
X_ = X.astype(str)
X_.info()
There are a total of 9 columns, and 0 features have missing values. There are a total of 9 columns, and 0 features have missing values. <class 'pandas.core.frame.DataFrame'> RangeIndex: 32769 entries, 0 to 32768 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RESOURCE 32769 non-null object 1 MGR_ID 32769 non-null object 2 ROLE_ROLLUP_1 32769 non-null object 3 ROLE_ROLLUP_2 32769 non-null object 4 ROLE_DEPTNAME 32769 non-null object 5 ROLE_TITLE 32769 non-null object 6 ROLE_FAMILY_DESC 32769 non-null object 7 ROLE_FAMILY 32769 non-null object 8 ROLE_CODE 32769 non-null object dtypes: object(9) memory usage: 2.3+ MB
# Class-balance bar chart: percentage of each ACTION label.
label_percentage = pd.DataFrame(Y.value_counts())
# Convert raw counts to percentages of the total.
# NOTE(review): this relies on value_counts producing a column named 'ACTION'
# (pandas < 2.0 behaviour); pandas >= 2.0 names that column 'count' -- verify
# against the installed pandas version.
label_percentage['ACTION'] = label_percentage['ACTION'] / label_percentage['ACTION'].sum() * 100
fig = px.bar(x = label_percentage.index ,
             y = label_percentage['ACTION'],
             text = np.round(label_percentage['ACTION'] , 2),
             color = pd.Series(label_percentage.index).astype(str),
             color_discrete_sequence = px.colors.sequential.RdBu ,
             template = 'plotly',
             title = 'Label Percentage')
# Black outlines make the bars easier to read.
fig.update_traces(width = 0.7 , marker = dict(line = dict(color = '#000000' , width = 2)))
fig.update_layout(xaxis_title = 'Label',
                  yaxis_title = 'Percentage(%)',
                  font = dict(size=17 , family = 'Franklin Gothic'))
fig.show()
# Combine train and test features to inspect the overall correlation structure.
X_total = pd.concat([X , X_test] , axis = 0)
corr = X_total.corr()
# Boolean mask hiding the strict upper triangle (the diagonal stays visible,
# matching the original plot).  The original reused the float correlation
# matrix itself as the mask, which only hid the upper triangle because those
# correlations happened to be non-zero; an explicit boolean mask is robust.
mask = np.triu(np.ones_like(corr , dtype = bool) , k = 1)
fig , ax = plt.subplots()
fig.set_size_inches(20 , 8)
sns.heatmap(corr,
            mask = mask,
            square = True,
            annot = True,
            fmt = '.1f',
            linewidth = .3).set_title('Correlation Plot' , fontsize = 20)
Text(0.5, 1.0, 'Correlation Plot')
# Scatter-plot matrix (SPLOM) of every feature pair over the combined data.
dimensions = [dict(label = col , values = X_total[col]) for col in X_total.columns]
fig = go.Figure(data = go.Splom(dimensions = dimensions ,
                                showupperhalf = False ,
                                marker = dict(showscale = False ,
                                              line_color = 'black',
                                              line_width = 0.5)))
# Fixed title typo: the original read 'Feature Pair Plit'.
fig.update_layout(title = 'Feature Pair Plot',
                  dragmode = 'select',
                  width = 1000 ,
                  height = 1000 ,
                  font = dict(size = 7 , family = 'Franklin Gothic') ,
                  hovermode = 'closest')
fig.show()
# Drop ROLE_CODE -- presumably redundant with the other role features; confirm
# against the correlation plot above.
X = X.drop('ROLE_CODE', axis = 1)
X_test = X_test.drop('ROLE_CODE' , axis = 1)
# Remember where the training rows end so the combined frame can be split back.
X_len = len(X)
X_total = pd.concat([X , X_test] , axis = 0)
X_total = X_total.reset_index(drop = True)
# Treat every feature as a categorical string.
X_total = X_total.astype(str)
# For each feature, report the share of test-set categories never seen in train.
for col in X_total.columns:
    diff_part = set(list(X_test[col])).difference(set(list(X[col])))
    train_data_part = set(list(X[col]))
    print('{} : {:.2%} of the categories in the testing set are not found in the training set.'.format(col , len(diff_part)/len(train_data_part)))
RESOURCE : 0.00% of the categories in the testing set are not found in the training set. MGR_ID : 15.79% of the categories in the testing set are not found in the training set. ROLE_ROLLUP_1 : 1.56% of the categories in the testing set are not found in the training set. ROLE_ROLLUP_2 : 3.39% of the categories in the testing set are not found in the training set. ROLE_DEPTNAME : 6.01% of the categories in the testing set are not found in the training set. ROLE_TITLE : 5.25% of the categories in the testing set are not found in the training set. ROLE_FAMILY_DESC : 25.15% of the categories in the testing set are not found in the training set. ROLE_FAMILY : 1.49% of the categories in the testing set are not found in the training set.
def filter_func(x , condition):
    """Collapse rare categories.

    Returns the sentinel 999999999 when ``x`` is contained in ``condition``
    (the collection of rare category values); otherwise returns ``x``
    unchanged.
    """
    return 999999999 if x in condition else x
# Merge rare categories (appearing 3 times or fewer across train+test) into a
# single shared sentinel so the train/test category sets line up better.
for col in X_total.columns:
    count = X_total[col].value_counts()
    # Use a set for O(1) membership tests inside filter_func -- the original
    # passed a list, making every .apply lookup O(len(condition)).
    condition = set(count.loc[count <= 3].index)
    X_total[col] = X_total[col].apply(lambda x : filter_func(x , condition))
# Split the combined frame back into the train / test partitions.
X = X_total.iloc[:X_len , :]
X_test = X_total.iloc[X_len: , :]
# Re-check how many test-set categories remain unseen in training.
for col in X_total.columns:
    diff_part = set(list(X_test[col])).difference(set(list(X[col])))
    train_data_part = set(list(X[col]))
    print('{} : {:.2%} of the categories in the test set are not found in the training set.'.format(col , len(diff_part)/len(train_data_part)))
RESOURCE : 0.00% of the categories in the test set are not found in the training set. MGR_ID : 3.68% of the categories in the test set are not found in the training set. ROLE_ROLLUP_1 : 0.00% of the categories in the test set are not found in the training set. ROLE_ROLLUP_2 : 0.00% of the categories in the test set are not found in the training set. ROLE_DEPTNAME : 1.40% of the categories in the test set are not found in the training set. ROLE_TITLE : 0.31% of the categories in the test set are not found in the training set. ROLE_FAMILY_DESC : 4.18% of the categories in the test set are not found in the training set. ROLE_FAMILY : 1.49% of the categories in the test set are not found in the training set.
There has been a significant reduction in the instances of inconsistency.
As the features are randomly encoded and nominal in nature, arithmetic operations like addition or subtraction wouldn't provide substantial improvements. However, we can explore the possibility of concatenating these features together.
combination_num = n : hash every combination of n features together to create one new feature per combination.
def concat_features_combination(df , cols , combination_num = 2):
    """Create hashed interaction features from column combinations.

    Parameters
    ----------
    df : np.ndarray
        2-D array of feature values (rows are samples).
    cols : sequence
        Column collection; only its length is used to enumerate the index
        combinations.
    combination_num : int, default 2
        How many columns to combine per new feature.

    Returns
    -------
    pd.DataFrame
        One column per index combination, named after the index tuple
        (e.g. '(0, 1)'), holding the hash of each row's combined values.
    """
    hashed_columns = []
    for idx in combinations(range(0 , len(cols)) , combination_num):
        selected = list(idx)
        row_hashes = [hash(tuple(row)) for row in df[: , selected]]
        hashed_columns.append(pd.DataFrame({str(idx) : row_hashes}))
    return pd.concat(hashed_columns , axis = 1)
# Build hashed pair-wise ("twi") and triple-wise ("tri") combination features.
X_total_twi = concat_features_combination(np.array(X_total) , X_total.columns)
X_total_tri = concat_features_combination(np.array(X_total) , X_total.columns , combination_num = 3)
# The hash values are arbitrary integers; treat them as categorical strings.
X_total_twi = X_total_twi.astype(str)
X_total_tri = X_total_tri.astype(str)
# Ordinal encoding for twi feature
enc_twi = OrdinalEncoder().fit(X_total_twi)
X_total_twi = enc_twi.transform(X_total_twi)
X_total_twi = X_total_twi.reset_index(drop = True)
# Ordinal encoding for tri feature
enc_tri = OrdinalEncoder().fit(X_total_tri)
X_total_tri = enc_tri.transform(X_total_tri)
X_total_tri = X_total_tri.reset_index(drop = True)
# concat original , twi , tri dataset
X_total_categorical = pd.concat([X_total , X_total_twi , X_total_tri] , axis = 1)
features = X_total_categorical.columns
class KFoldTargetEncoderTrain(base.BaseEstimator , base.TransformerMixin):
    """Out-of-fold target (mean) encoder for a single categorical column.

    For each of ``n_fold`` folds, the fold's rows receive the target mean of
    their category computed on the *other* folds only, which limits target
    leakage.  Categories unseen in the other folds fall back to the global
    target mean.

    NOTE(review): ``transform`` mutates the passed-in DataFrame in place
    (the encoded column is added to the caller's frame) in addition to
    returning it.
    """

    def __init__(self , colnames , targetName , n_fold = 5 , verbosity = True , discardOriginal_col = False):
        # Name of the single categorical column to encode.
        self.colnames = colnames
        # Name of the target column (must exist in the frame given to transform).
        self.targetName = targetName
        # Number of folds used for the out-of-fold means.
        self.n_fold = n_fold
        # If True, print the correlation between the new feature and the target.
        self.verbosity = verbosity
        # If True, drop the *target* column (not the encoded source column).
        self.discardOriginal_col = discardOriginal_col

    def fit(self , X , y = None):
        # Stateless: all work happens in transform.
        return self

    def transform(self , X):
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)
        # Global target mean: fallback for categories unseen in the other folds.
        mean_of_target = X[self.targetName].mean()
        # Fixed seed keeps the fold assignment (and thus the encoding) reproducible.
        kf = KFold(n_splits = self.n_fold , shuffle = True , random_state = 2019)
        col_mean_name = '{}_{}_{}'.format(self.colnames , self.n_fold , 'Kfold_Target')
        X[col_mean_name] = np.nan
        for tr_ind , val_ind in kf.split(X):
            X_tr , X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # Encode the held-out fold using category means from the other folds only.
            X.loc[X.index[val_ind] , col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
        # Categories absent from the training folds receive the global mean.
        X[col_mean_name].fillna(mean_of_target , inplace = True)
        if self.verbosity:
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                                                                  self.targetName,
                                                                                  np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a K-fold target encoding, learned on a training frame, to new data.

    For every category of ``colNames``, looks up the mean of the already
    computed ``encodedName`` column in ``train`` and substitutes test-set
    categories with those means.  Categories unseen in ``train`` keep their
    raw value (a property of ``DataFrame.replace``).
    """

    def __init__(self, train, colNames, encodedName):
        # Training frame that already contains the encoded column.
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        # Stateless: everything needed was captured in __init__.
        return self

    def transform(self, X):
        # Per-category mean of the encoded feature, learned from the training frame.
        category_means = (self.train[[self.colNames, self.encodedName]]
                          .groupby(self.colNames)
                          .mean()
                          .reset_index())
        lookup = dict(zip(category_means[self.colNames], category_means[self.encodedName]))
        # Seed the new column with the raw categories, then substitute the
        # learned means; replace() leaves unseen categories untouched.
        X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: lookup})
        return X
# Target-encode every (original + combined) categorical feature.
X_total_target_enc = X_total_categorical.copy(deep = True)
features = X_total_target_enc.columns
# Label column spanning train+test; test rows get a placeholder label of 1.
# NOTE(review): the placeholder test labels participate in the fold means, so
# the encoding of test rows is only approximate -- confirm this is intended.
Y_total = pd.DataFrame(np.ones(len(X_total_target_enc)))
Y_total.iloc[:X_len , 0] = Y
X_total_target_enc = X_total_target_enc.reset_index(drop = True)
X_total_target_enc = pd.concat([X_total_target_enc , pd.DataFrame(Y_total).rename({0 : 'label'} , axis = 1)] , axis = 1)
for feature in features:
    target_encoder = KFoldTargetEncoderTrain(feature , 'label' , n_fold = 5)
    target_encoder.fit(X_total_target_enc)
    X_total_target_enc = target_encoder.transform(X_total_target_enc)
# Keep only the encoded columns.
X_total_target_enc = X_total_target_enc.drop(list(features) + ['label'] , axis = 1)
Correlation between the new feature, RESOURCE_5_Kfold_Target and, label is 0.0823148642547431. Correlation between the new feature, MGR_ID_5_Kfold_Target and, label is 0.2250278954524642. Correlation between the new feature, ROLE_ROLLUP_1_5_Kfold_Target and, label is 0.06795436758082077. Correlation between the new feature, ROLE_ROLLUP_2_5_Kfold_Target and, label is 0.08453415435977743. Correlation between the new feature, ROLE_DEPTNAME_5_Kfold_Target and, label is 0.13923375096413834. Correlation between the new feature, ROLE_TITLE_5_Kfold_Target and, label is 0.09651871525048646. Correlation between the new feature, ROLE_FAMILY_DESC_5_Kfold_Target and, label is 0.1454111527141569. Correlation between the new feature, ROLE_FAMILY_5_Kfold_Target and, label is 0.07005939813376397. Correlation between the new feature, (0, 1)_5_Kfold_Target and, label is 0.11574893897693105. Correlation between the new feature, (0, 2)_5_Kfold_Target and, label is 0.08945481687763406. Correlation between the new feature, (0, 3)_5_Kfold_Target and, label is 0.10327990012197943. Correlation between the new feature, (0, 4)_5_Kfold_Target and, label is 0.10418925378785274. Correlation between the new feature, (0, 5)_5_Kfold_Target and, label is 0.061788214657021834. Correlation between the new feature, (0, 6)_5_Kfold_Target and, label is 0.09333919727701726. Correlation between the new feature, (0, 7)_5_Kfold_Target and, label is 0.07658374933200206. Correlation between the new feature, (1, 2)_5_Kfold_Target and, label is 0.22514140721771986. Correlation between the new feature, (1, 3)_5_Kfold_Target and, label is 0.2326991151355611. Correlation between the new feature, (1, 4)_5_Kfold_Target and, label is 0.22970556993950905. Correlation between the new feature, (1, 5)_5_Kfold_Target and, label is 0.24937457941097568. Correlation between the new feature, (1, 6)_5_Kfold_Target and, label is 0.2500101676189511. 
Correlation between the new feature, (1, 7)_5_Kfold_Target and, label is 0.23424094198316003. Correlation between the new feature, (2, 3)_5_Kfold_Target and, label is 0.08478778149869978. Correlation between the new feature, (2, 4)_5_Kfold_Target and, label is 0.1794776990454261. Correlation between the new feature, (2, 5)_5_Kfold_Target and, label is 0.14610269534178477. Correlation between the new feature, (2, 6)_5_Kfold_Target and, label is 0.17068586068240732. Correlation between the new feature, (2, 7)_5_Kfold_Target and, label is 0.1178837805316116. Correlation between the new feature, (3, 4)_5_Kfold_Target and, label is 0.19321640715046423. Correlation between the new feature, (3, 5)_5_Kfold_Target and, label is 0.17862733385295196. Correlation between the new feature, (3, 6)_5_Kfold_Target and, label is 0.19258104468306. Correlation between the new feature, (3, 7)_5_Kfold_Target and, label is 0.1477429016872061. Correlation between the new feature, (4, 5)_5_Kfold_Target and, label is 0.21278751114134453. Correlation between the new feature, (4, 6)_5_Kfold_Target and, label is 0.23016705282294153. Correlation between the new feature, (4, 7)_5_Kfold_Target and, label is 0.17900558504175962. Correlation between the new feature, (5, 6)_5_Kfold_Target and, label is 0.1603677678133851. Correlation between the new feature, (5, 7)_5_Kfold_Target and, label is 0.09581704091963046. Correlation between the new feature, (6, 7)_5_Kfold_Target and, label is 0.1493812146064738. Correlation between the new feature, (0, 1, 2)_5_Kfold_Target and, label is 0.11332049114761855. Correlation between the new feature, (0, 1, 3)_5_Kfold_Target and, label is 0.11634111214814916. Correlation between the new feature, (0, 1, 4)_5_Kfold_Target and, label is 0.11730521170283507. Correlation between the new feature, (0, 1, 5)_5_Kfold_Target and, label is 0.12007799625017455. Correlation between the new feature, (0, 1, 6)_5_Kfold_Target and, label is 0.1456781524930761. 
Correlation between the new feature, (0, 1, 7)_5_Kfold_Target and, label is 0.12017481471371297. Correlation between the new feature, (0, 2, 3)_5_Kfold_Target and, label is 0.10550259862614064. Correlation between the new feature, (0, 2, 4)_5_Kfold_Target and, label is 0.1194861355037409. Correlation between the new feature, (0, 2, 5)_5_Kfold_Target and, label is 0.07911493419431552. Correlation between the new feature, (0, 2, 6)_5_Kfold_Target and, label is 0.10590554243278491. Correlation between the new feature, (0, 2, 7)_5_Kfold_Target and, label is 0.09563146703856197. Correlation between the new feature, (0, 3, 4)_5_Kfold_Target and, label is 0.12094664302248602. Correlation between the new feature, (0, 3, 5)_5_Kfold_Target and, label is 0.103761641685737. Correlation between the new feature, (0, 3, 6)_5_Kfold_Target and, label is 0.12604626086425572. Correlation between the new feature, (0, 3, 7)_5_Kfold_Target and, label is 0.11036265343285878. Correlation between the new feature, (0, 4, 5)_5_Kfold_Target and, label is 0.112428875368086. Correlation between the new feature, (0, 4, 6)_5_Kfold_Target and, label is 0.13034697520778576. Correlation between the new feature, (0, 4, 7)_5_Kfold_Target and, label is 0.11791337239109857. Correlation between the new feature, (0, 5, 6)_5_Kfold_Target and, label is 0.08256367610835795. Correlation between the new feature, (0, 5, 7)_5_Kfold_Target and, label is 0.06195678273066569. Correlation between the new feature, (0, 6, 7)_5_Kfold_Target and, label is 0.09753883188611989. Correlation between the new feature, (1, 2, 3)_5_Kfold_Target and, label is 0.23135794286889103. Correlation between the new feature, (1, 2, 4)_5_Kfold_Target and, label is 0.22683209403267657. Correlation between the new feature, (1, 2, 5)_5_Kfold_Target and, label is 0.2446388085817751. Correlation between the new feature, (1, 2, 6)_5_Kfold_Target and, label is 0.24697448406393765. 
Correlation between the new feature, (1, 2, 7)_5_Kfold_Target and, label is 0.22881820271713704. Correlation between the new feature, (1, 3, 4)_5_Kfold_Target and, label is 0.23236789126029797. Correlation between the new feature, (1, 3, 5)_5_Kfold_Target and, label is 0.2499369553052264. Correlation between the new feature, (1, 3, 6)_5_Kfold_Target and, label is 0.25142729852511453. Correlation between the new feature, (1, 3, 7)_5_Kfold_Target and, label is 0.23541045851711365. Correlation between the new feature, (1, 4, 5)_5_Kfold_Target and, label is 0.2514492944218385. Correlation between the new feature, (1, 4, 6)_5_Kfold_Target and, label is 0.25200789904764387. Correlation between the new feature, (1, 4, 7)_5_Kfold_Target and, label is 0.23824244590084756. Correlation between the new feature, (1, 5, 6)_5_Kfold_Target and, label is 0.25737624976443724. Correlation between the new feature, (1, 5, 7)_5_Kfold_Target and, label is 0.24936952310973268. Correlation between the new feature, (1, 6, 7)_5_Kfold_Target and, label is 0.2508029364402449. Correlation between the new feature, (2, 3, 4)_5_Kfold_Target and, label is 0.19262700172076383. Correlation between the new feature, (2, 3, 5)_5_Kfold_Target and, label is 0.17721211491142658. Correlation between the new feature, (2, 3, 6)_5_Kfold_Target and, label is 0.1915477637386253. Correlation between the new feature, (2, 3, 7)_5_Kfold_Target and, label is 0.1464934635785995. Correlation between the new feature, (2, 4, 5)_5_Kfold_Target and, label is 0.230790855630149. Correlation between the new feature, (2, 4, 6)_5_Kfold_Target and, label is 0.23749477536705593. Correlation between the new feature, (2, 4, 7)_5_Kfold_Target and, label is 0.20685614904573338. Correlation between the new feature, (2, 5, 6)_5_Kfold_Target and, label is 0.18163582434709938. Correlation between the new feature, (2, 5, 7)_5_Kfold_Target and, label is 0.14615848529898468. 
Correlation between the new feature, (2, 6, 7)_5_Kfold_Target and, label is 0.17260721382053046. Correlation between the new feature, (3, 4, 5)_5_Kfold_Target and, label is 0.23588785558146852. Correlation between the new feature, (3, 4, 6)_5_Kfold_Target and, label is 0.24298596582797227. Correlation between the new feature, (3, 4, 7)_5_Kfold_Target and, label is 0.21567360987082237. Correlation between the new feature, (3, 5, 6)_5_Kfold_Target and, label is 0.2075801892846929. Correlation between the new feature, (3, 5, 7)_5_Kfold_Target and, label is 0.17874409016827053. Correlation between the new feature, (3, 6, 7)_5_Kfold_Target and, label is 0.1934926897277154. Correlation between the new feature, (4, 5, 6)_5_Kfold_Target and, label is 0.2467152928518916. Correlation between the new feature, (4, 5, 7)_5_Kfold_Target and, label is 0.21278326378450932. Correlation between the new feature, (4, 6, 7)_5_Kfold_Target and, label is 0.23262818430434898. Correlation between the new feature, (5, 6, 7)_5_Kfold_Target and, label is 0.1602983709891469.
def category_freq(X):
    """Append a '<col>_count' frequency feature for every column of ``X``.

    Each new column holds how many times that row's category value occurs in
    the whole frame.  The original columns are kept; ``X`` itself is not
    modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` with one additional count column per input column.
    """
    X_new = X.copy()
    for f in X_new.columns:
        count_data = X_new[f].value_counts().reset_index()
        # Name the columns positionally: value_counts().reset_index() yields
        # the category values first and the counts second in every pandas
        # version, whereas the default column *names* changed in pandas 2.0
        # (the original rename(columns={'index': f, ...}) breaks there).
        count_data.columns = [f , '{}_count'.format(f)]
        X_new = pd.merge(left = X_new , right = count_data , on = f , how ='left')
    return X_new
# Frequency-encode all categorical features, then drop the raw categories.
X_total_freq = category_freq(X_total_categorical)
X_total_freq = X_total_freq.drop(list(features) , axis = 1)
# Final numeric design matrices: frequency + target-encoded features, split
# back into train and test by row position.
X_train_num = np.concatenate([X_total_freq , X_total_target_enc] , axis = 1)[:X_len , :]
X_test_num = np.concatenate([X_total_freq , X_total_target_enc] , axis = 1)[X_len: , :]
X_train_num = pd.DataFrame(X_train_num)
X_test_num = pd.DataFrame(X_test_num)
def cv_score(model , X , Y):
    """Evaluate ``model`` with 5-fold stratified cross-validated ROC-AUC.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.
    X : pd.DataFrame of features.
    Y : pd.Series of binary labels.

    Returns
    -------
    float
        Mean AUC across the five folds.  (The original only printed this
        and returned None; returning it is backward compatible and lets
        callers reuse the score.)
    """
    kf = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 10)
    fold_aucs = []
    for train_index , val_index in kf.split(X , Y):
        X_tr = X.iloc[train_index , :]
        y_tr = Y.iloc[train_index]
        # Renamed from X_test/y_test to avoid shadowing the module-level test set.
        X_val = X.iloc[val_index , :]
        y_val = Y.iloc[val_index]
        model.fit(X_tr , y_tr)
        preds = model.predict_proba(X_val)[: , 1]
        fold_aucs.append(roc_auc_score(y_val , preds))
    mean_auc = np.mean(fold_aucs)
    print('AUC Score from model: {}'.format(mean_auc))
    return mean_auc
# Baseline models with hand-picked (not yet tuned) hyper-parameters.
# NOTE(review): max_features = 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3 -- use 'sqrt' on newer versions.
rf_base_model = RandomForestClassifier(n_estimators = 400,
                                       min_samples_split = 10,
                                       min_samples_leaf = 2,
                                       max_features = 'auto',
                                       max_depth = 20,
                                       bootstrap = False,
                                       n_jobs = -1 ,
                                       class_weight = 'balanced')
ex_base_model = ExtraTreesClassifier(n_estimators = 400,
                                     min_samples_split = 10,
                                     min_samples_leaf = 2,
                                     max_features = 'auto',
                                     max_depth = 20,
                                     bootstrap = False,
                                     n_jobs = -1 ,
                                     class_weight = 'balanced')
# LightGBM with library defaults.
lgb_base_model = LGBMClassifier(n_jobs = -1)
# XGBoost with an explicit (mostly default) parameter dump.
xgb_base_model = XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
                               early_stopping_rounds=None, enable_categorical=False,
                               eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
                               max_delta_step=0, max_depth=15, max_leaves=0, min_child_weight=1, monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
                               reg_alpha=0, reg_lambda=1)
# class_weights = [20, 1] up-weights class 0 -- presumably to counter the
# label imbalance shown earlier; confirm the intended direction.
cat_base_model = CatBoostClassifier(class_weights = [20 , 1] , verbose = False)
# 5-fold cross-validated AUC for each baseline.
cv_score(rf_base_model , X_train_num , Y)
cv_score(ex_base_model , X_train_num , Y)
cv_score(lgb_base_model , X_train_num , Y)
cv_score(xgb_base_model , X_train_num , Y)
cv_score(cat_base_model , X_train_num , Y)
AUC Score from model: 0.910357390065173 AUC Score from model: 0.9045941435407567 AUC Score from model: 0.9024192766450602 AUC Score from model: 0.9072390382146297 AUC Score from model: 0.9020717877478546
# def objective_cv_rf(trial , X = X_train_num , y = Y):
# global RANDOM_STATE
# print(RANDOM_STATE)
# param = {'bootstrap' : trial.suggest_categorical('bootstrap' , [True, False]) ,
# 'max_features' : trial.suggest_categorical('max_features' , ['auto' , 'sqrt']) ,
# 'criterion' : trial.suggest_categorical('criterion' , ['gini' , 'entropy']) ,
# 'n_estimators' : trial.suggest_int('n_estimators' , 100 , 1000) ,
# 'max_depth' : trial.suggest_int('max_depth' , 5 , 100) ,
# 'min_samples_split' : trial.suggest_int('min_samples_split' , 2 , 20),
# 'min_samples_leaf' : trial.suggest_int('min_samples_leaf' , 1 , 10)}
# kf = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)
# cv_scores = []
# for idx , (train_idx , test_idx) in enumerate(kf.split(X , y)):
# X_train , X_test = X.iloc[train_idx] , X.iloc[test_idx]
# y_train , y_test = y.iloc[train_idx] , y.iloc[test_idx]
# model = RandomForestClassifier(class_weight = 'balanced' , n_jobs = -1 , **param)
# model.fit(X_train , y_train)
# preds = model.predict_proba(X_test)[: , 1]
# AUC = roc_auc_score(y_test , preds)
# cv_scores.append(AUC)
# return -np.mean(cv_scores)
# rf_params_set = []
# for i in range(0 , 3):
# RANDOM_STATE = 42 + i
# study = optuna.create_study(direction = 'minimize')
# study.optimize(objective_cv_rf , n_trials = 100)
# print('Number of finished trials:{}'.format(len(study.trials)))
# rf_result = study.trials_dataframe()
# rf_params = study.best_params
# rf_params_set.append(rf_params)
# print('rf_{} completed !!'.format(i + 1))
# def objective_cv_ex(trial , X = X_train_num , y = Y):
# param = {'bootstrap' : trial.suggest_categorical('bootstrap' , [True, False]) ,
# 'max_features' : trial.suggest_categorical('max_features' , ['auto' , 'sqrt']) ,
# 'criterion' : trial.suggest_categorical('criterion' , ['gini' , 'entropy']) ,
# 'n_estimators' : trial.suggest_int('n_estimators' , 100 , 1000) ,
# 'max_depth' : trial.suggest_int('max_depth' , 5 , 100) ,
# 'min_samples_split' : trial.suggest_int('min_samples_split' , 2 , 20),
# 'min_samples_leaf' : trial.suggest_int('min_samples_leaf' , 1 , 10)}
# kf = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)
# cv_scores = []
# for idx , (train_idx , test_idx) in enumerate(kf.split(X , y)):
# X_train , X_test = X.iloc[train_idx] , X.iloc[test_idx]
# y_train , y_test = y.iloc[train_idx] , y.iloc[test_idx]
# model = ExtraTreesClassifier(class_weight = 'balanced' , n_jobs = -1 , **param)
# model.fit(X_train , y_train)
# preds = model.predict_proba(X_test)[: , 1]
# AUC = roc_auc_score(y_test , preds)
# cv_scores.append(AUC)
# return -np.mean(cv_scores)
# ex_params_set = []
# for i in range(0 , 3):
# RANDOM_STATE = 42 + i
# study = optuna.create_study(direction = 'minimize')
# study.optimize(objective_cv_ex , n_trials = 100)
# print('Number of finished trials:{}'.format(len(study.trials)))
# ex_result = study.trials_dataframe()
# ex_params = study.best_params
# ex_params_set.append(ex_params)
# print('ex_{} completed !!'.format(i + 1))
# def objective_cv_lgb(trial , X = X_train_num , y = Y):
# param = {'learning_rate' : trial.suggest_loguniform('learning_rate' , 0.006 , 0.02),
# 'reg_alpha' : trial.suggest_loguniform('reg_alpha' , 1e-9 , 10.0) , # L1 regularization
# 'reg_lambda' : trial.suggest_loguniform('reg_lambda' , 1e-9 , 10.0) , # L2 regularization
# 'colsample_bytree' : trial.suggest_categorical('colsample_bytree' , [i/10 for i in range(3 , 11)]) , # deal with over-fitting
# 'subsample' : trial.suggest_categorical('subsample' , [i/10 for i in range(4 , 11)]) , # deal with over-fitting
# 'n_estimators' : trial.suggest_int('n_estimators' , 100 , 20000) ,
# 'max_depth' : trial.suggest_int('max_depth' , 5 , 100),
# 'num_leaves' : trial.suggest_int('num_leaves' , 5 , 1000),
# 'min_child_samples' : trial.suggest_int('min_child_samples' , 1 , 300), # deal with over-fitting
# 'min_child_weight' : trial.suggest_int('min_child_samples' , 0.01 , 10), # deal with over-fitting
# 'cat_smooth' : trial.suggest_int('cat_smooth' , 1 , 100)}
# kf = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)
# cv_scores = []
# for idx , (train_idx , test_idx) in enumerate(kf.split(X , y)):
# X_train , X_test = X.iloc[train_idx] , X.iloc[test_idx]
# y_train , y_test = y.iloc[train_idx] , y.iloc[test_idx]
# model = LGBMClassifier(objective = 'binary' , metric = 'auc' , class_weight = 'balanced' , **param)
# model.fit(X_train , y_train , eval_set = [(X_test , y_test)] , early_stopping_rounds = 500 , verbose = False)
# preds = model.predict_proba(X_test)[: , 1]
# AUC = roc_auc_score(y_test , preds)
# cv_scores.append(AUC)
# return -np.mean(cv_scores)
# lgb_params_set = []
# for i in range(0 , 3):
# RANDOM_STATE = 42 + i
# study = optuna.create_study(direction = 'minimize')
# study.optimize(objective_cv_lgb , n_trials = 100)
# print('Number of finished trials:{}'.format(len(study.trials)))
# lgb_result = study.trials_dataframe()
# lgb_params = study.best_params
# lgb_params_set.append(lgb_params)
# def objective_cv_xgb(trial , X = X_train_num , y = Y):
# param = {'scale_pos_weight' : trial.suggest_uniform('scale_pos_weight' , 1/20 , 1/16) ,
# 'learning_rate' : trial.suggest_loguniform('learning_rate' , 0.006 , 0.02),
# 'lambda' : trial.suggest_loguniform('lambda' , 1e-3 , 10.0),
# 'alpha' : trial.suggest_loguniform('alpha' , 1e-3 , 10.0),
# 'reg_alpha' : trial.suggest_loguniform('reg_alpha' , 1e-3 , 10.0) ,
# 'reg_lambda' : trial.suggest_loguniform('reg_lambda' , 1e-3 , 10.0) ,
# 'colsample_bytree' : trial.suggest_categorical('colsample_bytree' , [i/10 for i in range(3 , 11)]) ,
# 'subsample' : trial.suggest_categorical('subsample' , [i/10 for i in range(4 , 11)]) ,
# 'n_estimators' : trial.suggest_int('n_estimators' , 100 , 20000) ,
# 'max_depth' : trial.suggest_int('max_depth' , 10 , 100),
# 'max_leaves' : trial.suggest_int('max_leaves' , 10 , 1000),
# 'min_child_weight' : trial.suggest_int('min_child_weight', 1 , 300)}
# kf = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)
# cv_scores = []
# for idx , (train_idx , test_idx) in enumerate(kf.split(X , y)):
# X_train , X_test = X.iloc[train_idx] , X.iloc[test_idx]
# y_train , y_test = y.iloc[train_idx] , y.iloc[test_idx]
# model = XGBClassifier(objective = 'binary:logistic' , eval_metric = 'auc' , **param)
# model.fit(X_train , y_train , eval_set = [(X_test , y_test)] , early_stopping_rounds = 500 , verbose = False)
# preds = model.predict(X_test)
# AUC = roc_auc_score(y_test , preds)
# cv_scores.append(AUC)
# return -np.mean(cv_scores)
# xgb_params_set = []
# for i in range(0 , 3):
# RANDOM_STATE = 42 + i
# study = optuna.create_study(direction = 'minimize')
# study.optimize(objective_cv_xgb , n_trials = 100)
# print('Number of finished trials:{}'.format(len(study.trials)))
# xgb_result = study.trials_dataframe()
# xgb_params = study.best_params
# xgb_params['objective'] = 'binary:logistic'
# xgb_params['eval_metric'] = 'auc'
# xgb_params_set.append(xgb_params)
# def objective_cv_cat(trial ,X = X_train_num , y = Y):
# param = {'l2_leaf_reg' : trial.suggest_loguniform('l2_leaf_reg' , 1e-3 , 10.0),
# # 'rsm' : trial.suggest_uniform('rsm' , 0.3 , 1.0),
# 'subsample' : trial.suggest_uniform('subsample' , 0.4 , 1.0),
# 'learning_rate' : trial.suggest_uniform('learning_rate' , 0.006 , 0.018),
# 'n_estimators' : trial.suggest_int('n_estimators' , 100 , 20000) ,
# 'max_depth' : trial.suggest_int('max_depth' , 5 , 15),
# 'max_bin' : trial.suggest_int('max_bin' , 200 , 400),
# 'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf' , 1 , 300) ,
# 'class_weights' : [trial.suggest_int('class_weights' , 16 , 20) , 1]}
# kf = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)
# cv_scores = []
# for idx , (train_idx , test_idx) in enumerate(kf.split(X , y)):
# X_train , X_test = X.iloc[train_idx] , X.iloc[test_idx]
# y_train , y_test = y.iloc[train_idx] , y.iloc[test_idx]
# model = CatBoostClassifier(loss_function = 'logloss' , eval_metric = 'AUC' , **param)
# model.fit(X_train , y_train , eval_set = [(X_test , y_test)] , early_stopping_rounds = 500 , verbose = False)
# preds = model.predict(X_test)
# AUC = roc_auc_score(y_test , preds)
# cv_scores.append(AUC)
# return -np.mean(cv_scores)
# cat_params_set = []
# for i in range(0 , 3):
# RANDOM_STATE = 42 + i
# study = optuna.create_study(direction = 'minimize')
# study.optimize(objective_cv_cat , n_trials = 100)
# print('Number of finished trials:{}'.format(len(study.trials)))
# cat_results = study.trials_dataframe()
# cat_params = study.best_params
# cat_params_set.append(cat_params)
# Base learners for the first stacking level, configured identically to the
# *_base_model instances above.
# FIX: the last entry of the original list was the bare parenthesized
# expression `(class_weights = [20 , 1] , verbose = False)` -- a syntax
# error; the `CatBoostClassifier` call (matching cat_base_model) was
# clearly dropped and is restored here.
models = [RandomForestClassifier(n_estimators = 400,
                                 min_samples_split = 10,
                                 min_samples_leaf = 2,
                                 max_features = 'auto',
                                 max_depth = 20,
                                 bootstrap = False,
                                 n_jobs = -1 ,
                                 class_weight = 'balanced') ,
          ExtraTreesClassifier(n_estimators = 400,
                               min_samples_split = 10,
                               min_samples_leaf = 2,
                               max_features = 'auto',
                               max_depth = 20,
                               bootstrap = False,
                               n_jobs = -1 ,
                               class_weight = 'balanced') ,
          LGBMClassifier(n_jobs = -1) ,
          XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                        colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
                        early_stopping_rounds=None, enable_categorical=False,
                        eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
                        importance_type=None, interaction_constraints='',
                        learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
                        max_delta_step=0, max_depth=15, max_leaves=0, min_child_weight=1, monotone_constraints='()', n_estimators=100,
                        n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
                        reg_alpha=0, reg_lambda=1) ,
          CatBoostClassifier(class_weights = [20 , 1] , verbose = False)]
def stacking_generate_feature(model , df_train , label , df_test):
    """Produce out-of-fold first-level features for stacking.

    Fits ``model`` on 5 stratified folds of ``df_train``: each fold's
    held-out predicted probabilities become first-level training features,
    while the test-set probabilities are averaged across the folds.

    Returns a tuple ``(train_features, train_labels, test_features)`` where
    ``train_features`` stacks the per-fold probability arrays vertically and
    ``train_labels`` is the matching reordered label vector.
    """
    splitter = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)
    oof_preds , oof_labels = [] , []
    fold_aucs = []
    test_preds = None
    n_folds = 0
    for fold_no , (fit_idx , val_idx) in enumerate(splitter.split(df_train , label)):
        fit_X , val_X = df_train.iloc[fit_idx] , df_train.iloc[val_idx]
        fit_y , val_y = label.iloc[fit_idx] , label.iloc[val_idx]
        model.fit(fit_X , fit_y)
        val_proba = model.predict_proba(val_X)
        oof_preds.append(val_proba)
        oof_labels.append(np.array(val_y))
        # Accumulate test predictions; divided by the fold count on return.
        if test_preds is None:
            test_preds = model.predict_proba(df_test)
        else:
            test_preds += model.predict_proba(df_test)
        # Track validation AUC on the positive-class probability column.
        fold_aucs.append(roc_auc_score(val_y , val_proba[: , 1]))
        n_folds = fold_no + 1
    print('AUC Score from model: {}'.format(np.mean(fold_aucs)))
    return np.vstack(oof_preds) , np.hstack(oof_labels) , test_preds / n_folds
# Build the stacked first-level feature matrices: one out-of-fold probability
# block per base model, concatenated horizontally.
# NOTE(review): assumes X_train_num / X_test_num (numeric feature frames) are
# defined earlier in the file — confirm against the preprocessing section.
train_feature_blocks = []
test_feature_blocks = []
for model in models:
    first_feature_train , first_label_train , first_feature_test = stacking_generate_feature(model , X_train_num , Y , X_test_num)
    train_feature_blocks.append(first_feature_train)
    test_feature_blocks.append(first_feature_test)
first_feature_train_list = np.hstack(train_feature_blocks)
first_feature_test_list = np.hstack(test_feature_blocks)
# Standardize because the second-level learner is linear (logistic regression).
first_feature_train_list = (first_feature_train_list - first_feature_train_list.mean(axis = 0)) / first_feature_train_list.std(axis = 0)
first_feature_test_list = (first_feature_test_list - first_feature_test_list.mean(axis = 0)) / first_feature_test_list.std(axis = 0)
# (pasted notebook output — commented so the script parses)
# AUC Score from model: 0.9115797547523599 AUC Score from model: 0.9056464301724725 AUC Score from model: 0.9046406849362534 AUC Score from model: 0.9097457458212356 AUC Score from model: 0.9070093177450431
def objective_cv_log(trial , X = first_feature_train_list , y = first_label_train):
    """Optuna objective for the second-level logistic regression.

    Suggests the regularization strength ``C`` and evaluates it with
    stratified 5-fold CV on the stacked first-level features. Returns the
    negated mean AUC because the study runs with direction='minimize'.
    """
    param = {'C' : trial.suggest_float('C' , 0.5 , 2.)}
    folds = StratifiedKFold(n_splits = 5 , shuffle = True , random_state = 42)
    fold_aucs = []
    for fit_idx , val_idx in folds.split(X , y):
        clf = LogisticRegression(class_weight = 'balanced' , **param)
        clf.fit(X[fit_idx , :] , y[fit_idx])
        val_proba = clf.predict_proba(X[val_idx , :])[: , 1]
        fold_aucs.append(roc_auc_score(y[val_idx] , val_proba))
    return -np.mean(fold_aucs)
# Tune the meta-learner's C over the stacked out-of-fold features.
# The objective returns -AUC, hence direction='minimize'.
study = optuna.create_study(direction = 'minimize')
study.optimize(objective_cv_log , n_trials = 100)
print('Number of finished trials:{}'.format(len(study.trials)))
log_result = study.trials_dataframe()
log_params = study.best_params
# Refit the second-level model on all stacked training features with the best C.
second_model = LogisticRegression(class_weight = 'balanced' , **study.best_params)
second_model.fit(first_feature_train_list , first_label_train)
# Positive-class (access granted) probability for the test set.
y_pred_prob = second_model.predict_proba(first_feature_test_list)[: , 1]
submission_df = pd.read_csv('./sampleSubmission.csv' , index_col = 'Id')
submission_df['Action'] = y_pred_prob
submission_df.to_csv('submission_summit_.csv')
[I 2023-06-04 16:26:19,389] A new study created in memory with name: no-name-e3fcc086-e17e-4e23-99e8-65e758a9d422 [I 2023-06-04 16:26:19,689] Trial 0 finished with value: -0.9113222537548369 and parameters: {'C': 0.6310087407353967}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:19,979] Trial 1 finished with value: -0.911320546687518 and parameters: {'C': 0.7312040586792805}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:20,276] Trial 2 finished with value: -0.9113119245742931 and parameters: {'C': 1.9685066930138557}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:20,562] Trial 3 finished with value: -0.9113141436434615 and parameters: {'C': 1.4691018427035274}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:20,853] Trial 4 finished with value: -0.9113136315274184 and parameters: {'C': 1.5844622285536631}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:21,143] Trial 5 finished with value: -0.9113120957433354 and parameters: {'C': 1.9313012243657508}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:21,433] Trial 6 finished with value: -0.911315252925436 and parameters: {'C': 1.3571666492352807}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:21,749] Trial 7 finished with value: -0.9113136315274184 and parameters: {'C': 1.5807700816428405}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:22,031] Trial 8 finished with value: -0.911315765491332 and parameters: {'C': 1.3094065689565595}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:22,312] Trial 9 finished with value: -0.9113161924571352 and parameters: {'C': 1.132160232969179}. Best is trial 0 with value: -0.9113222537548369. [I 2023-06-04 16:26:22,612] Trial 10 finished with value: -0.9113260943915827 and parameters: {'C': 0.5174727076731833}. Best is trial 10 with value: -0.9113260943915827. 
[I 2023-06-04 16:26:22,909] Trial 11 finished with value: -0.9113266069713204 and parameters: {'C': 0.5003389698319655}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:23,215] Trial 12 finished with value: -0.9113265214992625 and parameters: {'C': 0.5130637245769243}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:23,502] Trial 13 finished with value: -0.9113196931192548 and parameters: {'C': 0.7900753391922833}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:23,797] Trial 14 finished with value: -0.9113176448661671 and parameters: {'C': 0.916344505086474}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:24,104] Trial 15 finished with value: -0.9113266067602357 and parameters: {'C': 0.5011422183491755}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:24,415] Trial 16 finished with value: -0.9113175591968659 and parameters: {'C': 0.9483439664215729}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:24,735] Trial 17 finished with value: -0.9113212292978241 and parameters: {'C': 0.6737123831133285}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:25,037] Trial 18 finished with value: -0.91132635058283 and parameters: {'C': 0.5067095342895389}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:25,338] Trial 19 finished with value: -0.9113188402534538 and parameters: {'C': 0.8642601716852895}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:25,654] Trial 20 finished with value: -0.9113171320753445 and parameters: {'C': 1.030977932632654}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:25,965] Trial 21 finished with value: -0.9113239600054994 and parameters: {'C': 0.5800770849191539}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:26,276] Trial 22 finished with value: -0.9113209731204185 and parameters: {'C': 0.689872991611398}. 
Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:26,592] Trial 23 finished with value: -0.9113262653356985 and parameters: {'C': 0.5053869381241082}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:26,905] Trial 24 finished with value: -0.911319948860649 and parameters: {'C': 0.768038141075261}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:27,200] Trial 25 finished with value: -0.9113223390019684 and parameters: {'C': 0.6282499148623646}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:27,518] Trial 26 finished with value: -0.9113201197770815 and parameters: {'C': 0.7769826763703915}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:27,814] Trial 27 finished with value: -0.9113232778035215 and parameters: {'C': 0.6090900608595311}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:28,141] Trial 28 finished with value: -0.9113194374055436 and parameters: {'C': 0.8353449728177796}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:28,438] Trial 29 finished with value: -0.9113221682827788 and parameters: {'C': 0.6320849133452482}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:28,795] Trial 30 finished with value: -0.9113209729093338 and parameters: {'C': 0.6841421805909833}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:29,143] Trial 31 finished with value: -0.9113265210909345 and parameters: {'C': 0.5032709803851431}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:29,574] Trial 32 finished with value: -0.9113243866771679 and parameters: {'C': 0.5673451100473188}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:29,872] Trial 33 finished with value: -0.9113265210909345 and parameters: {'C': 0.5031839080498425}. Best is trial 11 with value: -0.9113266069713204. 
[I 2023-06-04 16:26:30,158] Trial 34 finished with value: -0.9113204614403866 and parameters: {'C': 0.7231839499936543}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:30,469] Trial 35 finished with value: -0.9113235337698422 and parameters: {'C': 0.5977890919605309}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:30,775] Trial 36 finished with value: -0.9113209729093338 and parameters: {'C': 0.699089527654217}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:31,073] Trial 37 finished with value: -0.9113235339809271 and parameters: {'C': 0.6050310320816239}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:31,367] Trial 38 finished with value: -0.9113203757434022 and parameters: {'C': 0.7543645693790858}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:31,659] Trial 39 finished with value: -0.9113247284961913 and parameters: {'C': 0.5482568124125494}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:31,964] Trial 40 finished with value: -0.9113213997782452 and parameters: {'C': 0.6592756977580569}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:32,257] Trial 41 finished with value: -0.9113266067602357 and parameters: {'C': 0.500902990818921}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:32,534] Trial 42 finished with value: -0.9113243866771679 and parameters: {'C': 0.5672957714946735}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:32,813] Trial 43 finished with value: -0.91132635058283 and parameters: {'C': 0.5055084252017243}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:33,100] Trial 44 finished with value: -0.9113218266333153 and parameters: {'C': 0.6422576488772596}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:33,394] Trial 45 finished with value: -0.911324472135384 and parameters: {'C': 0.568047637222003}. 
Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:33,716] Trial 46 finished with value: -0.9113205466875179 and parameters: {'C': 0.7185845428781317}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:34,047] Trial 47 finished with value: -0.9113195228637597 and parameters: {'C': 0.8283850768983458}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:34,382] Trial 48 finished with value: -0.911321826647157 and parameters: {'C': 0.6457921764711271}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:34,664] Trial 49 finished with value: -0.9113265210909345 and parameters: {'C': 0.5035975285093657}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:34,952] Trial 50 finished with value: -0.9113243864383997 and parameters: {'C': 0.5586125466973777}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:35,249] Trial 51 finished with value: -0.9113266065491507 and parameters: {'C': 0.5017331573816065}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:35,549] Trial 52 finished with value: -0.9113238745472831 and parameters: {'C': 0.5754108963175381}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:35,837] Trial 53 finished with value: -0.911320546687518 and parameters: {'C': 0.7285334964606038}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:36,135] Trial 54 finished with value: -0.9113217414000255 and parameters: {'C': 0.6511759256634403}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:36,424] Trial 55 finished with value: -0.9113248139682494 and parameters: {'C': 0.5394402155119704}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:36,719] Trial 56 finished with value: -0.9113230220621272 and parameters: {'C': 0.6109570420057877}. Best is trial 11 with value: -0.9113266069713204. 
[I 2023-06-04 16:26:36,999] Trial 57 finished with value: -0.91132635058283 and parameters: {'C': 0.5094877480628474}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:37,297] Trial 58 finished with value: -0.911319778366386 and parameters: {'C': 0.7815967090968063}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:37,602] Trial 59 finished with value: -0.9113210585786348 and parameters: {'C': 0.6927123939284848}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:37,921] Trial 60 finished with value: -0.9113162777042667 and parameters: {'C': 1.15610445090506}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:38,214] Trial 61 finished with value: -0.91132635058283 and parameters: {'C': 0.5070721326312807}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:38,543] Trial 62 finished with value: -0.9113245573548323 and parameters: {'C': 0.5565963167342616}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:38,843] Trial 63 finished with value: -0.9113235335587573 and parameters: {'C': 0.5946766578256364}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:39,208] Trial 64 finished with value: -0.91132635058283 and parameters: {'C': 0.5069122570204723}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:39,578] Trial 65 finished with value: -0.9113229366039108 and parameters: {'C': 0.6157784362669595}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:39,939] Trial 66 finished with value: -0.9113211440506926 and parameters: {'C': 0.6686861187742725}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:40,320] Trial 67 finished with value: -0.9113248994264656 and parameters: {'C': 0.5372926898302767}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:40,685] Trial 68 finished with value: -0.9113246428130486 and parameters: {'C': 0.5535093839515152}. 
Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:41,029] Trial 69 finished with value: -0.9113229368149959 and parameters: {'C': 0.6138433096899986}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:41,450] Trial 70 finished with value: -0.911320546687518 and parameters: {'C': 0.7297560158354471}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:41,837] Trial 71 finished with value: -0.9113252408510025 and parameters: {'C': 0.5324741361421935}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:42,170] Trial 72 finished with value: -0.9113237040530203 and parameters: {'C': 0.5904324471919259}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:42,484] Trial 73 finished with value: -0.91132635058283 and parameters: {'C': 0.5069373953099776}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:42,768] Trial 74 finished with value: -0.911321314769882 and parameters: {'C': 0.6679364785641186}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:43,067] Trial 75 finished with value: -0.91132635058283 and parameters: {'C': 0.5058858735759584}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:43,346] Trial 76 finished with value: -0.9113220828107209 and parameters: {'C': 0.6346498821994115}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:43,657] Trial 77 finished with value: -0.9113238745472831 and parameters: {'C': 0.5764625716753704}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:43,941] Trial 78 finished with value: -0.9113248139682494 and parameters: {'C': 0.5420306801124141}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:44,232] Trial 79 finished with value: -0.91132105836755 and parameters: {'C': 0.6865731504927213}. Best is trial 11 with value: -0.9113266069713204. 
[I 2023-06-04 16:26:44,556] Trial 80 finished with value: -0.9113235335587573 and parameters: {'C': 0.5941259633204272}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:44,864] Trial 81 finished with value: -0.9113266065491507 and parameters: {'C': 0.5023964494789035}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:45,148] Trial 82 finished with value: -0.9113248139682494 and parameters: {'C': 0.5408918487123474}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:45,449] Trial 83 finished with value: -0.9113224244740262 and parameters: {'C': 0.6230000989000969}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:45,752] Trial 84 finished with value: -0.9113265210909345 and parameters: {'C': 0.5037408467059455}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:46,049] Trial 85 finished with value: -0.9113241307246888 and parameters: {'C': 0.5734941819456281}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:46,344] Trial 86 finished with value: -0.9113248139682494 and parameters: {'C': 0.5438860340508017}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:46,648] Trial 87 finished with value: -0.9113218266333153 and parameters: {'C': 0.6431256503378568}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:46,957] Trial 88 finished with value: -0.9113236188058889 and parameters: {'C': 0.5922275513805937}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:47,267] Trial 89 finished with value: -0.9113248139682494 and parameters: {'C': 0.5397457516516793}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:47,577] Trial 90 finished with value: -0.911320290285186 and parameters: {'C': 0.7523590969438094}. Best is trial 11 with value: -0.9113266069713204. 
[I 2023-06-04 16:26:47,871] Trial 91 finished with value: -0.9113266069713204 and parameters: {'C': 0.5005526005063408}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:48,153] Trial 92 finished with value: -0.9113262651246137 and parameters: {'C': 0.50509445557397}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:48,447] Trial 93 finished with value: -0.9113240452526308 and parameters: {'C': 0.5816817598745608}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:48,740] Trial 94 finished with value: -0.9113247287211179 and parameters: {'C': 0.5442514488529283}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:49,053] Trial 95 finished with value: -0.9113229366039108 and parameters: {'C': 0.6169053400205441}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:49,359] Trial 96 finished with value: -0.9113242161829049 and parameters: {'C': 0.5700182071363037}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:49,673] Trial 97 finished with value: -0.9113251553927864 and parameters: {'C': 0.529489993825656}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:49,982] Trial 98 finished with value: -0.9113213995533188 and parameters: {'C': 0.660396463482948}. Best is trial 11 with value: -0.9113266069713204. [I 2023-06-04 16:26:50,279] Trial 99 finished with value: -0.9113209731204185 and parameters: {'C': 0.691191336767315}. Best is trial 11 with value: -0.9113266069713204.
# (pasted notebook output) Number of finished trials:100
# NOTE(review): this repeats the earlier second_model fit/predict with the
# same study.best_params, writing to a different file name
# ('submission_summit.csv' vs 'submission_summit_.csv') — likely redundant.
log_model = LogisticRegression(class_weight = 'balanced' , **study.best_params )
log_model.fit(first_feature_train_list , first_label_train)
y_pred_prob = log_model.predict_proba(first_feature_test_list)[: , 1]
submission_df = pd.read_csv('./sampleSubmission.csv' , index_col = 'Id')
submission_df['Action'] = y_pred_prob
submission_df.to_csv('submission_summit.csv')